Load Libraries
#source("https://bioconductor.org/biocLite.R")
#biocLite('compare')
library(digest)
library(methylumi)
library(lumi)
library(minfi)
library(minfiData)
library(gplots)
library(reshape2)
library(ggplot2)
library("RColorBrewer")
library(gplots)
library(gridExtra)
library(ggplot2)
library(grid)
library(lattice)
library(compare)
library(limma)
library(matrixStats)
Read in Design matrix
setwd("Z:/ROBLAB1 coredata-databases/1 Samantha DATA Folder/PROJECTS/PE_IUGR_Array/Robinson Cohort")
##Read in Phenotypic data
des<-read.csv('Design_matrix_WPR_2015.csv')
str(des)
## 'data.frame': 102 obs. of 17 variables:
## $ ParticipantID : Factor w/ 102 levels "PL102","PL104",..: 5 25 21 4 13 26 15 14 22 20 ...
## $ group : Factor w/ 6 levels "EOPE","IUGR",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ IUGR : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
## $ Ethnicity : Factor w/ 10 levels "Asian","Caucasian",..: NA NA NA NA NA NA NA NA NA NA ...
## $ MA : num 38.7 23.7 37.3 25.4 34.7 35.8 26.6 35.9 26.9 27.8 ...
## $ Sex : Factor w/ 2 levels "FEMALE","MALE": 2 1 1 2 2 2 2 2 2 1 ...
## $ GA : num 25 26 28 28 28.6 31.4 32 32.4 32.4 32.6 ...
## $ BW : int 978 758 1200 1245 1455 1849 1810 NA 1795 2100 ...
## $ BW_SD : num 0.66 -0.5 0.56 0.86 1.7 0.99 0.82 NA 0.74 0.69 ...
## $ PW : int 286 265 305 218 230 360 360 430 300 500 ...
## $ F_PL : num 3.42 2.86 3.93 5.71 6.33 5.14 5.03 NA 5.98 4.2 ...
## $ PL_length : num NA NA NA NA NA NA NA NA NA NA ...
## $ PL_breadth : num NA NA NA NA NA NA NA NA NA NA ...
## $ Lgth_Bdth : num NA NA NA NA NA NA NA NA NA NA ...
## $ Plate : Factor w/ 5 levels "WG0003252-MSA2",..: 1 5 3 3 5 1 1 1 2 1 ...
## $ Sentrix_ID : num 6.04e+09 9.98e+09 7.97e+09 7.97e+09 1.00e+10 ...
## $ Sentrix_Position: Factor w/ 12 levels "R01C01","R01C02",..: 9 12 3 12 5 3 5 3 3 9 ...
des$Sentrix_ID<-as.factor(des$Sentrix_ID)
Reading in IDAT Files
##Read in the raw IDAT Files
setwd("Z:/ROBLAB1 coredata-databases/1 Samantha DATA Folder/PROJECTS/PE_IUGR_Array/Robinson Cohort")
path <- "IDAT Files"
list.files(path)
list.files(file.path(path, "7970368036"))
#targets <- read.450k.sheet(path)
targets <- read.metharray.sheet(path)
baseDir <- system.file(path, package = "minfiData")
sub(baseDir, "", targets$Basename)
##If there is a character(0) within the files, you have a incorrect IDAT file, go back and double check
##Had issues with exceeding memory limit in this next step
## use this function if you have issues memory.limit(size=)
RGset <- read.metharray.exp(targets = targets, verbose = TRUE)
RGset
pd <- pData(RGset)
colnames(pd) <- gsub("X", "sampleName", colnames(pd))
Quality Control Checks
qcReport(RGset, sampNames = pd$sampleName, sampGroups = pd$cell, pdf = "minfi_qcReport.pdf")
## Warning: NON-POLYMORPHIC probes outside plot range
## png
## 2
##Some plots of the quality of the data
densityPlot(RGset, sampGroups = pd$group,main = "Beta", xlab = "Beta")
Raw,Genome Studio equivalent and swan normalization ofr comparison For comparison, make a raw, Illumina, and SWAN preprocessed datasets
MSet.raw <- preprocessRaw(RGset)
MSet.raw <- MSet.raw[order(featureNames(MSet.raw)), ]
MSet.norm <- preprocessIllumina(RGset, bg.correct = TRUE, normalize = "no")
MSet.norm <- MSet.norm[order(featureNames(MSet.norm)), ]
MSet.swan <- preprocessSWAN(RGset, mSet = MSet.norm, verbose = TRUE)
MSet.swan <- MSet.swan[order(featureNames(MSet.swan)), ]
Functional normalization
MSet.fnorm <- preprocessFunnorm(RGset, nPCs = 2, sex = NULL, bgCorr = TRUE, dyeCorr = TRUE, verbose = TRUE)
## [preprocessFunnorm] Background and dye bias correction with noob
## [preprocessNoob] Applying R/G ratio flip to fix dye bias...
## [preprocessFunnorm] Mapping to genome
## [preprocessFunnorm] Quantile extraction
## [preprocessFunnorm] Normalization
# functional normalization with background and dye bias correction using noob
MSet.fnorm <- MSet.fnorm[order(featureNames(MSet.fnorm)), ]
MSet.fnorm
## class: GenomicRatioSet
## dim: 485512 102
## metadata(0):
## assays(2): Beta CN
## rownames(485512): cg00000029 cg00000108 ... ch.X.97737721F
## ch.X.98007042R
## rowData names(0):
## colnames(102): 6042324020_R05C01 9977525015_R06C02 ...
## 6042308143_R06C02 9296930098_R05C02
## colData names(20): ParticipantID group ... filenames predictedSex
## Annotation
## array: IlluminaHumanMethylation450k
## annotation: ilmn12.hg19
## Preprocessing
## Method: NA
## minfi version: NA
## Manifest version: NA
##save project
##save(MSet.fnorm, file = "Z:/ROBLAB1 coredata-databases/1 Samantha DATA Folder/PROJECTS/PE_IUGR_Array/Robinson Cohort/Mset.fnorm_Jan2016.RData")
More Quality Control
##Checking that all samples have the same sex as the records record
sex <- getSex(MSet.fnorm)
plotSex(sex)
##All samples plotted to the correct fetal sex
Assessing Illumina Normalization (GenomeStudio)
qc <- getQC(MSet.norm)
plot(as.matrix(getQC(MSet.norm)))
Looking at sample groups with funNorm
##set colour palette
group.col <- c("purple4", "dodgerblue2", "turquoise1", "darkorange", "violetred")
mdsPlot(getBeta(MSet.fnorm), numPositions = 485512, sampGroups = pd$group, sampNames = pd$sampleName, pal = group.col, legendPos = "topright")
mdsPlot(getBeta(MSet.fnorm), numPositions = 100000, sampGroups = pd$group, sampNames = pd$sampleName, pal = group.col,legendPos = "topright")
mdsPlot(getBeta(MSet.fnorm), numPositions = 10000, sampGroups = pd$group, sampNames = pd$sampleName, pal = group.col,legendPos = "topright")
mdsPlot(getBeta(MSet.fnorm), numPositions = 1000, sampGroups = pd$group, sampNames = pd$sampleName, pal = group.col,legendPos = "topright")
Comparing Normalization methods
all(featureNames(MSet.raw) == featureNames(MSet.norm))
## [1] TRUE
all(featureNames(MSet.raw) == featureNames(MSet.fnorm))
## [1] TRUE
all(featureNames(MSet.raw) == featureNames(MSet.swan))
## [1] TRUE
probeTypes <- data.frame(Name = featureNames(MSet.raw),
Type = getProbeType(MSet.raw))
par(mfrow = c(2, 2))
plotBetasByType(MSet.raw[,1], main = "Raw")
plotBetasByType(MSet.norm[,1], main = "GS_norm")
plotBetasByType(MSet.swan[,1], main = "SWAN")
plotBetasByType(getBeta(MSet.fnorm[,1]), probeTypes = probeTypes, main = "funNorm_noob")
##By the looks of things here functional normalization appears to do a better job at normalizing my samples, with the type 1 and type 2 probes being closer together
(v.chp.col<-as.vector(pd$Slide))
(v.chp.col<-gsub("6042308147","black",v.chp.col))
(v.chp.col<-gsub("6042324020","green",v.chp.col))
(v.chp.col<-gsub("7970368014","purple",v.chp.col))
(v.chp.col<-gsub("7970368023","red",v.chp.col))
(v.chp.col<-gsub("7970368036","blue",v.chp.col))
(v.chp.col<-gsub("7970368050","yellow",v.chp.col))
(v.chp.col<-gsub("7970368054","orange",v.chp.col))
(v.chp.col<-gsub("7970368062","grey",v.chp.col))
(v.chp.col<-gsub("7970368066","darkred",v.chp.col))
(v.chp.col<-gsub("7970368076","lightgreen",v.chp.col))
(v.chp.col<-gsub("7970368097","darkblue",v.chp.col))
(v.chp.col<-gsub("7970368112","white",v.chp.col))
(v.chp.col<-gsub("7970368142","pink",v.chp.col))
(v.chp.col<-gsub("7973201026","lightyellow",v.chp.col))
(v.chp.col<-gsub("7973201038","lightblue",v.chp.col))
(v.chp.col<-gsub("9266441046","black",v.chp.col))
(v.chp.col<-gsub("9266441156","green",v.chp.col))
(v.chp.col<-gsub("9285451020","purple",v.chp.col))
(v.chp.col<-gsub("9285451059","red",v.chp.col))
(v.chp.col<-gsub("9296930098","blue",v.chp.col))
(v.chp.col<-gsub("9296930103","yellow",v.chp.col))
(v.chp.col<-gsub("9296930123","orange",v.chp.col))
(v.chp.col<-gsub("9977525013","grey",v.chp.col))
(v.chp.col<-gsub("9977525015","darkred",v.chp.col))
(v.chp.col<-gsub("10005833024","lightgreen",v.chp.col))
(v.chp.col<-gsub("10005833037","darkblue",v.chp.col))
(v.chp.col<-gsub("10005833038","white",v.chp.col))
(v.chp.col<-gsub("6042308143","pink",v.chp.col))
(v.chp.col<-gsub("7970368100","lightblue",v.chp.col))
v.chp.col
(v.grp.col<-as.vector(pd$group))
(v.grp.col<-gsub("Term","black",v.grp.col))
(v.grp.col<-gsub("PreT","chocolate4",v.grp.col))
(v.grp.col<-gsub("LOPE","blue",v.grp.col))
(v.grp.col<-gsub("IUGR","goldenrod",v.grp.col))
(v.grp.col<-gsub("REPLICATE","red",v.grp.col))
(v.grp.col<-gsub("EOPE","lightseagreen",v.grp.col))
v.grp.col
Looking at batch effects with heatmap
##colour
grey <- colorRampPalette(brewer.pal(n = 9, "Greys"))
library(gplots)
cor.raw <- cor(getBeta(MSet.raw), use = "pairwise.complete.obs")
rownames(cor.raw) <- pd$ParticipantID
heatmap.2(cor.raw, main = "MSet.raw, no SNPs - 485,512 probes",
trace = "none", col = grey, dendrogram = "row",
RowSideColors = v.grp.col, cexRow = 0.5,
ColSideColors = v.chp.col, cexCol = 0.9, keysize = 1, margins = c(20,20),
key = TRUE)
legend("bottomright",bty="n",title="Side bar colours",c("Term","PreT", "LOPE","IUGR","REPLICATE","EOPE"),
fill=c("black","chocolate4","blue","goldenrod","red","lightseagreen"),ncol=3)
cor.norm <- cor(getBeta(MSet.norm), use = "pairwise.complete.obs")
rownames(cor.norm) <- pd$ParticipantID
heatmap.2(cor.norm, main = "MSet.norm, no SNPs - 485,512 probes",
trace = "none", col = grey, dendrogram = "row",
RowSideColors = v.grp.col, cexRow = 0.5,
ColSideColors = v.chp.col, cexCol = 0.9, keysize = 1, margins = c(20,20))
legend("bottomright",bty="n",title="Side bar colours",c("Term","PreT", "LOPE","IUGR","REPLICATE","EOPE"),
fill=c("black","chocolate4","blue","goldenrod","red","lightseagreen"),ncol=3)
cor.swan <- cor(getBeta(MSet.swan), use = "pairwise.complete.obs")
rownames(cor.swan) <- pd$ParticipantID
heatmap.2(cor.swan, main = "MSet.swan, no SNPs - 485,512 probes",
trace = "none", col = grey, dendrogram = "row",
RowSideColors = v.grp.col, cexRow = 0.5,
ColSideColors = v.chp.col, cexCol = 0.9, keysize = 1, margins = c(20,20))
legend("bottomright",bty="n",title="Side bar colours",c("Term","PreT", "LOPE","IUGR","REPLICATE","EOPE"),
fill=c("black","chocolate4","blue","goldenrod","red","lightseagreen"),ncol=3)
cor.fnorm <- cor(getBeta(MSet.fnorm), use = "pairwise.complete.obs")
rownames(cor.fnorm) <- pd$ParticipantID
heatmap.2(cor.fnorm, main = "MSet.fnorm, no SNPs - 485,512 probes",
trace = "none", col = grey, dendrogram = "row",
RowSideColors = v.grp.col, cexRow = 0.5,
ColSideColors = v.chp.col, cexCol = 0.9, keysize = 1, margins = c(20,20))
legend("bottomright",bty="n",title="Side bar colours",c("Term","PreT", "LOPE","IUGR","REPLICATE","EOPE"),
fill=c("black","chocolate4","blue","goldenrod","red","lightseagreen"),ncol=3)
Taking m values from Functional Normalization and putting into methylumi PROJECT First make a methylumi object with all samples
setwd('Z:/ROBLAB1 coredata-databases/1 Samantha DATA Folder/PROJECTS/PE_IUGR_Array/Robinson Cohort')
##allFile <- file.choose() ## From GenomeStudio: all samples, all columns (all illumina annotation information)
# + average beta, detection Pval, signal A, signal B
##betaFile <- file.choose() ## From GenomeStudio: all the samples, columns
# + average beta
##qcFile <- file.choose() ## From GenomeStudio: under Control Probes Profile - all samples, all columns
# + Signal_Grn, Signal_Red, Detection Pval
load('PROJECT.original_nofilter_Jan2016.RData')
load('PROJECT.2.original_nofilter_Jan2016.RData')
##PROJECT<-lumiMethyR(allFile)
##PROJECT.2 <- methylumiR(betaFile,qcfile=qcFile)
# DataSummary
PROJECT#110 samples * 485,577 features
## MethyLumiM (storageMode: lockedEnvironment)
## assayData: 485577 features, 110 samples
## element names: detection, exprs, methylated, unmethylated
## protocolData: none
## phenoData
## sampleNames: PM139 PL11 ... PM306r (110 total)
## varLabels: sampleID label
## varMetadata: labelDescription
## featureData
## featureNames: cg00000029 cg00000108 ... rs9839873 (485577 total)
## fvarLabels: MAPINF0-1 MAPINFO+1 ... DHS (95 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
#sampleNames(PROJECT)
##save(PROJECT,file="PROJECT.original_nofilter_Jan2016.RData")
##save(PROJECT.2,file="PROJECT.2.original_nofilter_Jan2016.RData")
Organizing Design Matrix file
##reading in Phenotype data
des<-read.csv("Design_matrix_WPR_from540_final.csv",header=T)
rownames(des)<-des$ParticipantID
#str(des)
#Removing unneeded phenotypic data (mostly incomplete data)
des$Sentrix_ID<-as.factor(des$Sentrix_ID)
des$F_PL<-NULL
des$PW<-NULL
des$PL_length<-NULL
des$PL_breadth<-NULL
des$Lgth_Bdth<-NULL
des$Ethnicity<-NULL
des$BW<-NULL
#str(des)
all(sampleNames(PROJECT)%in% rownames(des)) #TRUE
## [1] TRUE
stopifnot(all(sampleNames(PROJECT)%in% rownames(des))) #TRUE
Des <- des[sampleNames(PROJECT),] # *** must reorder des so that sampleNames & des are in same order!!
stopifnot(all(sampleNames(PROJECT)%in% rownames(Des))) #TRUE
#sampleNames(PROJECT)
#rownames(Des) # do a visual check of sample names
pData(PROJECT) <- des
PROJECT <- PROJECT[, order(sampleNames(PROJECT))] # reorder by sample names
#sampleNames(PROJECT)<-rownames(des)
#organize PROJECT.2
PROJECT.2 <- PROJECT.2[,order(sampleNames(PROJECT.2))]
sampleNames(PROJECT.2) # do a visual check of sample names in PROJECT & PROJECT.2
## [1] "PL102" "PL104" "PL11" "PL112" "PL113" "PL130" "PL131"
## [8] "PL135" "PL142" "PL145" "PL21" "PL21r" "PL21r2" "PL25"
## [15] "PL26" "PL32" "PL33" "PL38" "PL43" "PL56" "PL58"
## [22] "PL59" "PL64" "PL64r1" "PL64r2" "PL65" "PL72" "PL76"
## [29] "PL86" "PL96" "PM112" "PM114" "PM115" "PM116" "PM119"
## [36] "PM12" "PM120" "PM121" "PM122" "PM123" "PM129" "PM130"
## [43] "PM136" "PM138" "PM139" "PM139r1" "PM139r2" "PM142" "PM15"
## [50] "PM153" "PM158" "PM161" "PM167" "PM17" "PM20" "PM205"
## [57] "PM21" "PM226" "PM227" "PM249" "PM252" "PM256" "PM263"
## [64] "PM269" "PM272" "PM275" "PM285" "PM29" "PM30" "PM306"
## [71] "PM306r" "PM307" "PM31" "PM313" "PM32" "PM320" "PM321"
## [78] "PM35" "PM36" "PM38" "PM39" "PM4" "PM40" "PM41"
## [85] "PM43" "PM44" "PM46" "PM47" "PM49" "PM51" "PM52"
## [92] "PM53" "PM54" "PM55" "PM58" "PM6" "PM64" "PM66"
## [99] "PM67" "PM71" "PM72" "PM72r" "PM74" "PM77" "PM80"
## [106] "PM86" "PM87" "PM97" "PM98" "PM99"
#check that sample names & feature names are the same in PROJECT & PROJECT.2
all(featureNames(PROJECT)%in%featureNames(PROJECT.2)) ## Must be TRUE
## [1] TRUE
all(sampleNames(PROJECT)%in%sampleNames(PROJECT.2)) ## Must be TRUE
## [1] TRUE
Bad detection p values
badDetP <- detection(PROJECT)>0.01
nbadDetP <- print(sum(rowSums(badDetP)>=5))##1250 # Number of probes with at least one bad detectionP --2443
## [1] 1250
tbadDetP<-print(sum(badDetP)) # total number of NAs -- 32581
## [1] 32581
nbadDetP.t<-cbind(colSums(badDetP),as.character(PROJECT$Plate))
str(nbadDetP.t)
## chr [1:110, 1:2] "128" "426" "639" "265" "127" "331" ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:110] "PM39" "PM43" "PM139" "PL113" ...
## ..$ : NULL
#write.table(nbadDetP.t,file='bad dectection p vals_Jan2016.txt')
Missing Beta Values
avgbeta <- betas(PROJECT.2)[featureNames(PROJECT),sampleNames(PROJECT)]
badAvgbeta <- is.na(avgbeta)
nbadAvgbeta <- print(sum(rowSums(badAvgbeta)>=1))# Number of probes with at least one no avgbeta -- 70,814
## [1] 70814
tbadAvgbeta<-print(sum(badAvgbeta)) # total number of NAs -- 98199
## [1] 98199
nbadAvgbeta.t<-cbind(colSums(badAvgbeta), as.character(PROJECT$Plate))
str(nbadAvgbeta.t)
## chr [1:110, 1:2] "995" "702" "433" "567" "689" "800" ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:110] "PL102" "PL104" "PL11" "PL112" ...
## ..$ : NULL
#write.table(nbadAvgbeta.t,file='missing betas_all samples_Jan2016.txt')
Filtering Probes-bad probes (missing betas and bad detetion p vals)
# Set numbers for bad probes
(nSamples<-length(sampleNames(PROJECT)))
## [1] 110
(Per5<-5)
## [1] 5
## bad detection p value (>0.01)
badDetP <- detection(PROJECT)>0.01
nbadDetP <- print(sum(rowSums(badDetP)>=5)) # Number of probes ---- 1250- this is a very strinent criteria, to cut down the amount of probes I'm removing I will only take samples in which detection p val is>0.01 in >20% of samples
## [1] 1250
## missing beta values
betas.NA <- betas(PROJECT.2)[featureNames(PROJECT.2),sampleNames(PROJECT.2)]
#head(betas.NA)
badAvgbeta<- is.na(betas.NA)
nbadAvgbeta <- print(sum(rowSums(badAvgbeta)>Per5))# Number of probes ---- 705
## [1] 705
##only 705 probes are missing beta values in >5% of my samples
# total number of bad probes
badProbes <- rowSums(badDetP)>=Per5|rowSums(badAvgbeta)>=Per5 ## denotes that we're removing any probe with either a bad average beta or a bad detection P values in more than 5% of samples
sum(badProbes)# Number of probes that will be removed ------ 2294
## [1] 2294
PROJECT.filt <- PROJECT[!badProbes,] #removes all badProbes
dim(PROJECT.filt) #110 samples, 483283 probes
## Features Samples
## 483283 110
##save a version of the project with no NAs
PROJECT.noNA<-PROJECT.filt
PROJECT.2.filt<-PROJECT.2[!badProbes,]
dim(PROJECT.2.filt)
## Features Samples
## 483283 110
Any detection p values>0.01, and <5% percent of samples become NA Putting NAs from PROJECT.2.filt into PROJECT.filt
sum(is.na(exprs(PROJECT.filt)))##0
## [1] 0
sum((detection(PROJECT.filt)>0.01))##6541 probes with >0.01 reading
## [1] 6541
sum(is.na(betas(PROJECT.2)))##98199- total number of sites with NAs
## [1] 98199
##Putting in NAs
exprs(PROJECT.filt)[is.na(betas(PROJECT.2.filt))]<-NA
exprs.NA <- exprs(PROJECT.filt)[featureNames(PROJECT.filt),sampleNames(PROJECT.filt)]
exprs.badAvgbeta<- is.na(exprs.NA)
exprs.nbadAvgbeta <- print(sum(rowSums(exprs.badAvgbeta)>=1)) #69400- slightly less than above, because we took out bad probes
## [1] 69400
stopifnot(length(nbadAvgbeta)==length(exprs.nbadAvgbeta))
exprs.tbadAvgbeta<-print(sum(exprs.badAvgbeta))# total number of NAs in M values-- 89031
## [1] 89031
# Transfer NAs into M values for probes with bad detection pvalues
sum(badDetP <- detection(PROJECT.filt)>0.01) #6541 (same as above)
## [1] 6541
exprs(PROJECT.filt)[(detection(PROJECT.filt)>0.01)]<-NA
exprs.NA <- exprs(PROJECT.filt)[featureNames(PROJECT.filt),sampleNames(PROJECT.filt)]
exprs.NA.T<- is.na(exprs.NA) # matrix of T/F is the M value an NA?
exprs.n <- print(sum(rowSums(exprs.NA.T)>=1))#number of probes with >=1 NA -- 71974
## [1] 71974
print(sum(exprs.NA.T))# total number of NAs -- 95505
## [1] 95434
# Sanity check that NAs were transferred
sum(is.na(exprs(PROJECT.filt))) # 95505
## [1] 95434
##PROJECT with PROJECT.2 NAs in place- be used in any statistical analysis
PROJECT.NA<-PROJECT.filt
Filter XY probes,SNP probes, and cross hybridizing probes
##remove XY chromosome probes PROJECT.noNA
PROJECT.xy <- PROJECT.noNA[fData(PROJECT.noNA)$CHR%in%c("X","Y"),]
PROJECT.xy #n=11,273
## MethyLumiM (storageMode: lockedEnvironment)
## assayData: 11273 features, 110 samples
## element names: detection, exprs, methylated, unmethylated
## protocolData: none
## phenoData
## sampleNames: PL102 PL104 ... PM99 (110 total)
## varLabels: ParticipantID group ... Sentrix_Position (11 total)
## varMetadata: labelDescription
## featureData
## featureNames: cg00005617 cg00006815 ... ch.X.98007042R (11273
## total)
## fvarLabels: MAPINF0-1 MAPINFO+1 ... DHS (95 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
PROJECT.noNA.noXY<- PROJECT.noNA[!fData(PROJECT.noNA)$CHR%in%c("X","Y"),]
PROJECT.noNA.noXY #110 samples, 472010 probes
## MethyLumiM (storageMode: lockedEnvironment)
## assayData: 472010 features, 110 samples
## element names: detection, exprs, methylated, unmethylated
## protocolData: none
## phenoData
## sampleNames: PL102 PL104 ... PM99 (110 total)
## varLabels: ParticipantID group ... Sentrix_Position (11 total)
## varMetadata: labelDescription
## featureData
## featureNames: cg00000029 cg00000108 ... rs9839873 (472010 total)
## fvarLabels: MAPINF0-1 MAPINFO+1 ... DHS (95 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
##PROJECT.NA
PROJECT.xy <- PROJECT.NA[fData(PROJECT.NA)$CHR%in%c("X","Y"),]
PROJECT.xy #n=11,273
## MethyLumiM (storageMode: lockedEnvironment)
## assayData: 11273 features, 110 samples
## element names: detection, exprs, methylated, unmethylated
## protocolData: none
## phenoData
## sampleNames: PL102 PL104 ... PM99 (110 total)
## varLabels: ParticipantID group ... Sentrix_Position (11 total)
## varMetadata: labelDescription
## featureData
## featureNames: cg00005617 cg00006815 ... ch.X.98007042R (11273
## total)
## fvarLabels: MAPINF0-1 MAPINFO+1 ... DHS (95 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
PROJECT.NA.noXY<- PROJECT.NA[!fData(PROJECT.NA)$CHR%in%c("X","Y"),]
PROJECT.NA.noXY #110 samples, 472010 probes
## MethyLumiM (storageMode: lockedEnvironment)
## assayData: 472010 features, 110 samples
## element names: detection, exprs, methylated, unmethylated
## protocolData: none
## phenoData
## sampleNames: PL102 PL104 ... PM99 (110 total)
## varLabels: ParticipantID group ... Sentrix_Position (11 total)
## varMetadata: labelDescription
## featureData
## featureNames: cg00000029 cg00000108 ... rs9839873 (472010 total)
## fvarLabels: MAPINF0-1 MAPINFO+1 ... DHS (95 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
##PROJECT.2.filt
PROJECT.xy <- PROJECT.2.filt[fData(PROJECT.2.filt)$CHR%in%c("X","Y"),]
PROJECT.xy #n=11,273
##
## Object Information:
## MethyLumiSet (storageMode: lockedEnvironment)
## assayData: 11273 features, 110 samples
## element names: betas
## protocolData: none
## phenoData
## sampleNames: PL102 PL104 ... PM99 (110 total)
## varLabels: sampleID label
## varMetadata: labelDescription
## featureData
## featureNames: cg00005617 cg00006815 ... ch.X.98007042R (11273
## total)
## fvarLabels: MAPINF0-1 MAPINFO+1 ... DHS (95 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
## Major Operation History:
## submitted finished
## 1 2016-01-13 14:35:54 2016-01-13 14:40:25
## 2 2016-11-29 17:24:26 2016-11-29 17:24:28
## 3 2016-11-29 17:24:47 2016-11-29 17:24:50
## 4 2016-11-29 17:25:34 2016-11-29 17:25:34
## command
## 1 methylumiR(filename = betaFile, qcfile = qcFile)
## 2 Subset of 110 samples.
## 3 Subset of 483283 features.
## 4 Subset of 11273 features.
PROJECT.2.filt.noXY<- PROJECT.2.filt[!fData(PROJECT.2.filt)$CHR%in%c("X","Y"),]
PROJECT.2.filt.noXY #110 samples, 472010 probes
##
## Object Information:
## MethyLumiSet (storageMode: lockedEnvironment)
## assayData: 472010 features, 110 samples
## element names: betas
## protocolData: none
## phenoData
## sampleNames: PL102 PL104 ... PM99 (110 total)
## varLabels: sampleID label
## varMetadata: labelDescription
## featureData
## featureNames: cg00000029 cg00000108 ... rs9839873 (472010 total)
## fvarLabels: MAPINF0-1 MAPINFO+1 ... DHS (95 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
## Major Operation History:
## submitted finished
## 1 2016-01-13 14:35:54 2016-01-13 14:40:25
## 2 2016-11-29 17:24:26 2016-11-29 17:24:28
## 3 2016-11-29 17:24:47 2016-11-29 17:24:50
## 4 2016-11-29 17:25:34 2016-11-29 17:25:37
## command
## 1 methylumiR(filename = betaFile, qcfile = qcFile)
## 2 Subset of 110 samples.
## 3 Subset of 483283 features.
## 4 Subset of 472010 features.
Filter Polymorphic Probes
##PROJECT.noNA
fvarLabels(PROJECT.noNA.noXY)
## [1] "MAPINF0-1" "MAPINFO+1"
## [3] "Probe_start" "Probe_end"
## [5] "SNPCpG" "n_SNPCpG"
## [7] "SNPprobe" "n_SNPprobe"
## [9] "HIL_CpG_class" "HIL_CpG_Island_Name"
## [11] "n_bp_repetitive" "AlleleA_Hits"
## [13] "AlleleB_Hits" "XY_Hits"
## [15] "Autosomal_Hits" "Closest_TSS"
## [17] "Closest_TSS_1" "Distance_closest_TSS"
## [19] "Closest_TSS_gene_name" "Closest_TSS_Transcript"
## [21] "firstexon5UTRnHits" "firstexon5UTRAccessions"
## [23] "firstexonBodynHits" "firstexonBodyAccessions"
## [25] "firstexon3UTRnHits" "firstexon3UTRAccessions"
## [27] "exon5UTRnHits" "exon5UTRAccessions"
## [29] "exonBodynHits" "exonBodyAccessions"
## [31] "exon3UTRnHits" "exon3UTRAccessions"
## [33] "intron5UTRnHits" "intron5UTRAccessions"
## [35] "intronBodynHits" "intronBodyAccessions"
## [37] "intron3UTRnHits" "intron3UTRAccessions"
## [39] "IlluminaAccession" "firstexon5UTRnHitsPerGene"
## [41] "firstexon5UTRGene" "firstexonBodynHitsPerGene"
## [43] "firstexonBodyGene" "firstexon3UTRnHitsPerGene"
## [45] "firstexon3UTRGene" "exon5UTRnHitsPerGene"
## [47] "exon5UTRGene" "exonBodynHitsPerGene"
## [49] "exonBodyGene" "exon3UTRnHitsPerGene"
## [51] "exon3UTRGene" "intron5UTRnHitsPerGene"
## [53] "intron5UTRGene" "intronBodynHitsPerGene"
## [55] "intronBodyGene" "intron3UTRnHitsPerGene"
## [57] "intron3UTRGene" "IlluminaGene"
## [59] "Index" "TargetID"
## [61] "ProbeID_A" "ProbeID_B"
## [63] "ILMNID" "NAME"
## [65] "ADDRESSA_ID" "ALLELEA_PROBESEQ"
## [67] "ADDRESSB_ID" "ALLELEB_PROBESEQ"
## [69] "INFINIUM_DESIGN_TYPE" "NEXT_BASE"
## [71] "COLOR_CHANNEL" "FORWARD_SEQUENCE"
## [73] "GENOME_BUILD" "CHR"
## [75] "MAPINFO" "SOURCESEQ"
## [77] "CHROMOSOME_36" "COORDINATE_36"
## [79] "STRAND" "PROBE_SNPS"
## [81] "PROBE_SNPS_10" "RANDOM_LOCI"
## [83] "METHYL27_LOCI" "UCSC_REFGENE_NAME"
## [85] "UCSC_REFGENE_ACCESSION" "UCSC_REFGENE_GROUP"
## [87] "UCSC_CPG_ISLANDS_NAME" "RELATION_TO_UCSC_CPG_ISLAND"
## [89] "PHANTOM" "DMR"
## [91] "ENHANCER" "HMM_ISLAND"
## [93] "REGULATORY_FEATURE_NAME" "REGULATORY_FEATURE_GROUP"
## [95] "DHS"
PROJECT.noNA.noXY.noSNP<-PROJECT.noNA.noXY[substring(fData(PROJECT.noNA.noXY)[,5], 1,2)!="rs",] #removes all SNP probes
PROJECT.noNA.noXY.noSNP #110 samples, 453,182 probes
## MethyLumiM (storageMode: lockedEnvironment)
## assayData: 451806 features, 110 samples
## element names: detection, exprs, methylated, unmethylated
## protocolData: none
## phenoData
## sampleNames: PL102 PL104 ... PM99 (110 total)
## varLabels: ParticipantID group ... Sentrix_Position (11 total)
## varMetadata: labelDescription
## featureData
## featureNames: cg00000029 cg00000108 ... rs9839873 (451806 total)
## fvarLabels: MAPINF0-1 MAPINFO+1 ... DHS (95 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
##PROJECT.NA
fvarLabels(PROJECT.NA.noXY)
## [1] "MAPINF0-1" "MAPINFO+1"
## [3] "Probe_start" "Probe_end"
## [5] "SNPCpG" "n_SNPCpG"
## [7] "SNPprobe" "n_SNPprobe"
## [9] "HIL_CpG_class" "HIL_CpG_Island_Name"
## [11] "n_bp_repetitive" "AlleleA_Hits"
## [13] "AlleleB_Hits" "XY_Hits"
## [15] "Autosomal_Hits" "Closest_TSS"
## [17] "Closest_TSS_1" "Distance_closest_TSS"
## [19] "Closest_TSS_gene_name" "Closest_TSS_Transcript"
## [21] "firstexon5UTRnHits" "firstexon5UTRAccessions"
## [23] "firstexonBodynHits" "firstexonBodyAccessions"
## [25] "firstexon3UTRnHits" "firstexon3UTRAccessions"
## [27] "exon5UTRnHits" "exon5UTRAccessions"
## [29] "exonBodynHits" "exonBodyAccessions"
## [31] "exon3UTRnHits" "exon3UTRAccessions"
## [33] "intron5UTRnHits" "intron5UTRAccessions"
## [35] "intronBodynHits" "intronBodyAccessions"
## [37] "intron3UTRnHits" "intron3UTRAccessions"
## [39] "IlluminaAccession" "firstexon5UTRnHitsPerGene"
## [41] "firstexon5UTRGene" "firstexonBodynHitsPerGene"
## [43] "firstexonBodyGene" "firstexon3UTRnHitsPerGene"
## [45] "firstexon3UTRGene" "exon5UTRnHitsPerGene"
## [47] "exon5UTRGene" "exonBodynHitsPerGene"
## [49] "exonBodyGene" "exon3UTRnHitsPerGene"
## [51] "exon3UTRGene" "intron5UTRnHitsPerGene"
## [53] "intron5UTRGene" "intronBodynHitsPerGene"
## [55] "intronBodyGene" "intron3UTRnHitsPerGene"
## [57] "intron3UTRGene" "IlluminaGene"
## [59] "Index" "TargetID"
## [61] "ProbeID_A" "ProbeID_B"
## [63] "ILMNID" "NAME"
## [65] "ADDRESSA_ID" "ALLELEA_PROBESEQ"
## [67] "ADDRESSB_ID" "ALLELEB_PROBESEQ"
## [69] "INFINIUM_DESIGN_TYPE" "NEXT_BASE"
## [71] "COLOR_CHANNEL" "FORWARD_SEQUENCE"
## [73] "GENOME_BUILD" "CHR"
## [75] "MAPINFO" "SOURCESEQ"
## [77] "CHROMOSOME_36" "COORDINATE_36"
## [79] "STRAND" "PROBE_SNPS"
## [81] "PROBE_SNPS_10" "RANDOM_LOCI"
## [83] "METHYL27_LOCI" "UCSC_REFGENE_NAME"
## [85] "UCSC_REFGENE_ACCESSION" "UCSC_REFGENE_GROUP"
## [87] "UCSC_CPG_ISLANDS_NAME" "RELATION_TO_UCSC_CPG_ISLAND"
## [89] "PHANTOM" "DMR"
## [91] "ENHANCER" "HMM_ISLAND"
## [93] "REGULATORY_FEATURE_NAME" "REGULATORY_FEATURE_GROUP"
## [95] "DHS"
PROJECT.NA.noXY.noSNP<-PROJECT.NA.noXY[substring(fData(PROJECT.NA.noXY)[,5], 1,2)!="rs",] #removes all SNP probes
PROJECT.NA.noXY.noSNP #110 samples, 453,182 probes
## MethyLumiM (storageMode: lockedEnvironment)
## assayData: 451806 features, 110 samples
## element names: detection, exprs, methylated, unmethylated
## protocolData: none
## phenoData
## sampleNames: PL102 PL104 ... PM99 (110 total)
## varLabels: ParticipantID group ... Sentrix_Position (11 total)
## varMetadata: labelDescription
## featureData
## featureNames: cg00000029 cg00000108 ... rs9839873 (451806 total)
## fvarLabels: MAPINF0-1 MAPINFO+1 ... DHS (95 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
##PROJECT.2.filt
fvarLabels(PROJECT.2.filt.noXY)
## [1] "MAPINF0-1" "MAPINFO+1"
## [3] "Probe_start" "Probe_end"
## [5] "SNPCpG" "n_SNPCpG"
## [7] "SNPprobe" "n_SNPprobe"
## [9] "HIL_CpG_class" "HIL_CpG_Island_Name"
## [11] "n_bp_repetitive" "AlleleA_Hits"
## [13] "AlleleB_Hits" "XY_Hits"
## [15] "Autosomal_Hits" "Closest_TSS"
## [17] "Closest_TSS_1" "Distance_closest_TSS"
## [19] "Closest_TSS_gene_name" "Closest_TSS_Transcript"
## [21] "firstexon5UTRnHits" "firstexon5UTRAccessions"
## [23] "firstexonBodynHits" "firstexonBodyAccessions"
## [25] "firstexon3UTRnHits" "firstexon3UTRAccessions"
## [27] "exon5UTRnHits" "exon5UTRAccessions"
## [29] "exonBodynHits" "exonBodyAccessions"
## [31] "exon3UTRnHits" "exon3UTRAccessions"
## [33] "intron5UTRnHits" "intron5UTRAccessions"
## [35] "intronBodynHits" "intronBodyAccessions"
## [37] "intron3UTRnHits" "intron3UTRAccessions"
## [39] "IlluminaAccession" "firstexon5UTRnHitsPerGene"
## [41] "firstexon5UTRGene" "firstexonBodynHitsPerGene"
## [43] "firstexonBodyGene" "firstexon3UTRnHitsPerGene"
## [45] "firstexon3UTRGene" "exon5UTRnHitsPerGene"
## [47] "exon5UTRGene" "exonBodynHitsPerGene"
## [49] "exonBodyGene" "exon3UTRnHitsPerGene"
## [51] "exon3UTRGene" "intron5UTRnHitsPerGene"
## [53] "intron5UTRGene" "intronBodynHitsPerGene"
## [55] "intronBodyGene" "intron3UTRnHitsPerGene"
## [57] "intron3UTRGene" "IlluminaGene"
## [59] "Index" "TargetID"
## [61] "ProbeID_A" "ProbeID_B"
## [63] "ILMNID" "NAME"
## [65] "ADDRESSA_ID" "ALLELEA_PROBESEQ"
## [67] "ADDRESSB_ID" "ALLELEB_PROBESEQ"
## [69] "INFINIUM_DESIGN_TYPE" "NEXT_BASE"
## [71] "COLOR_CHANNEL" "FORWARD_SEQUENCE"
## [73] "GENOME_BUILD" "CHR"
## [75] "MAPINFO" "SOURCESEQ"
## [77] "CHROMOSOME_36" "COORDINATE_36"
## [79] "STRAND" "PROBE_SNPS"
## [81] "PROBE_SNPS_10" "RANDOM_LOCI"
## [83] "METHYL27_LOCI" "UCSC_REFGENE_NAME"
## [85] "UCSC_REFGENE_ACCESSION" "UCSC_REFGENE_GROUP"
## [87] "UCSC_CPG_ISLANDS_NAME" "RELATION_TO_UCSC_CPG_ISLAND"
## [89] "PHANTOM" "DMR"
## [91] "ENHANCER" "HMM_ISLAND"
## [93] "REGULATORY_FEATURE_NAME" "REGULATORY_FEATURE_GROUP"
## [95] "DHS"
PROJECT.2.filt.noXY.noSNP<-PROJECT.2.filt.noXY[substring(fData(PROJECT.2.filt.noXY)[,5], 1,2)!="rs",] #removes all SNP probes
PROJECT.2.filt.noXY.noSNP #110 samples, 453,182 probes
##
## Object Information:
## MethyLumiSet (storageMode: lockedEnvironment)
## assayData: 451806 features, 110 samples
## element names: betas
## protocolData: none
## phenoData
## sampleNames: PL102 PL104 ... PM99 (110 total)
## varLabels: sampleID label
## varMetadata: labelDescription
## featureData
## featureNames: cg00000029 cg00000108 ... rs9839873 (451806 total)
## fvarLabels: MAPINF0-1 MAPINFO+1 ... DHS (95 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
## Major Operation History:
## submitted finished
## 1 2016-01-13 14:35:54 2016-01-13 14:40:25
## 2 2016-11-29 17:24:26 2016-11-29 17:24:28
## 3 2016-11-29 17:24:47 2016-11-29 17:24:50
## 4 2016-11-29 17:25:34 2016-11-29 17:25:37
## 5 2016-11-29 17:26:00 2016-11-29 17:26:07
## command
## 1 methylumiR(filename = betaFile, qcfile = qcFile)
## 2 Subset of 110 samples.
## 3 Subset of 483283 features.
## 4 Subset of 472010 features.
## 5 Subset of 451806 features.
Filter Cross hybridizing probes
##PROJECT.noNA
xy_hit_index <- which(featureData(PROJECT.noNA.noXY.noSNP)$XY_Hits == "XY_NO")
PROJECT.noNA.noXY.noSNP.noCH<- PROJECT.noNA.noXY.noSNP[xy_hit_index, ]
PROJECT.noNA.noXY.noSNP.noCH #110 samples, 441093 probes
## MethyLumiM (storageMode: lockedEnvironment)
## assayData: 441093 features, 110 samples
## element names: detection, exprs, methylated, unmethylated
## protocolData: none
## phenoData
## sampleNames: PL102 PL104 ... PM99 (110 total)
## varLabels: ParticipantID group ... Sentrix_Position (11 total)
## varMetadata: labelDescription
## featureData
## featureNames: cg00000029 cg00000108 ... ch.9.991104F (441093
## total)
## fvarLabels: MAPINF0-1 MAPINFO+1 ... DHS (95 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
##PROJECT.NA
xy_hit_index <- which(featureData(PROJECT.NA.noXY.noSNP)$XY_Hits == "XY_NO")
PROJECT.NA.noXY.noSNP.noCH<- PROJECT.NA.noXY.noSNP[xy_hit_index, ]
PROJECT.NA.noXY.noSNP.noCH #110 samples, 441093 probes
## MethyLumiM (storageMode: lockedEnvironment)
## assayData: 441093 features, 110 samples
## element names: detection, exprs, methylated, unmethylated
## protocolData: none
## phenoData
## sampleNames: PL102 PL104 ... PM99 (110 total)
## varLabels: ParticipantID group ... Sentrix_Position (11 total)
## varMetadata: labelDescription
## featureData
## featureNames: cg00000029 cg00000108 ... ch.9.991104F (441093
## total)
## fvarLabels: MAPINF0-1 MAPINFO+1 ... DHS (95 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
##PROJECT.2.filt
xy_hit_index <- which(featureData(PROJECT.2.filt.noXY.noSNP)$XY_Hits == "XY_NO")
PROJECT.2.filt.noXY.noSNP.noCH<- PROJECT.2.filt.noXY.noSNP[xy_hit_index, ]
PROJECT.2.filt.noXY.noSNP.noCH #110 samples, 441093 probes
##
## Object Information:
## MethyLumiSet (storageMode: lockedEnvironment)
## assayData: 441093 features, 110 samples
## element names: betas
## protocolData: none
## phenoData
## sampleNames: PL102 PL104 ... PM99 (110 total)
## varLabels: sampleID label
## varMetadata: labelDescription
## featureData
## featureNames: cg00000029 cg00000108 ... ch.9.991104F (441093
## total)
## fvarLabels: MAPINF0-1 MAPINFO+1 ... DHS (95 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
## Major Operation History:
## submitted finished
## 1 2016-01-13 14:35:54 2016-01-13 14:40:25
## 2 2016-11-29 17:24:26 2016-11-29 17:24:28
## 3 2016-11-29 17:24:47 2016-11-29 17:24:50
## 4 2016-11-29 17:25:34 2016-11-29 17:25:37
## 5 2016-11-29 17:26:00 2016-11-29 17:26:07
## 6 2016-11-29 17:26:24 2016-11-29 17:26:28
## command
## 1 methylumiR(filename = betaFile, qcfile = qcFile)
## 2 Subset of 110 samples.
## 3 Subset of 483283 features.
## 4 Subset of 472010 features.
## 5 Subset of 451806 features.
## 6 Subset of 441093 features.
Save PROJECTS
##save PROJECT.NA
PROJECT.NA<-PROJECT.NA.noXY.noSNP.noCH
##save(PROJECT.NA,file="PROJECT.NA_Jan2016.RData")
## PROJECT with no NAs in place- used in any visual
PROJECT.noNA<-PROJECT.noNA.noXY.noSNP.noCH
##save(PROJECT.noNA,file="PROJECT.noNA_Jan2016.RData")
##save PROJECT.2.filt
PROJECT.filt.2<-PROJECT.2.filt.noXY.noSNP.noCH
##save(PROJECT.2.filt,file="PROJECT.2.filt_Jan2016.RData")
sum(is.na((exprs(PROJECT.noNA))))##0
## [1] 0
sum(is.na((exprs(PROJECT.NA))))##86369
## [1] 86311
sum(is.na((betas(PROJECT.2.filt))))
## [1] 89031
Now putting in the M values from Functional Normalization into the methylumi object
setwd('Z:/ROBLAB1 coredata-databases/1 Samantha DATA Folder/PROJECTS/PE_IUGR_Array/Robinson Cohort')
##Load functional normalization project
##load("Z:/ROBLAB1 coredata-databases/1 Samantha DATA Folder/PROJECTS/PE_IUGR_Array/Robinson Cohort/Mset.fnorm_Jan2016.RData")
##Load the SWAN normalized project and des file
##load("PROJECT.NA_Jan2016.RData")
PROJECT<-PROJECT.NA
dim(PROJECT)
## Features Samples
## 441093 110
des<-read.csv('Design_matrix_WPR_2015.csv')
#str(des)
des$Sentrix_ID<-as.factor(des$Sentrix_ID)
##Remove samples from project that are not in MSet.fnorm
##First compare sampleNames
#sampleNames(PROJECT)
#MSet.fnorm$ParticipantID
##removing chr.ab samples from PROJECT
rm <- c("PM40","PM41","PM29","PM35","PM121","PM226","PM256","PM306r")
PROJECT<-PROJECT[, -which(sampleNames(PROJECT) %in% rm)]
dim(PROJECT)
## Features Samples
## 441093 102
#sampleNames(PROJECT)
MSet.fnorm$ParticipantID
## [1] "PL113" "PL64r2" "PL58" "PL112" "PL21r2" "PL65" "PL26"
## [8] "PL25" "PL59" "PL56" "PL104" "PL38" "PL33" "PL43"
## [15] "PL11" "PL102" "PL32" "PL96" "PL76" "PM249" "PM77"
## [22] "PM161" "PM17" "PM20" "PM122" "PM142" "PM87" "PM158"
## [29] "PM74" "PM120" "PM136" "PM306" "PM114" "PM205" "PM285"
## [36] "PM112" "PM275" "PM263" "PM153" "PM167" "PM227" "PM252"
## [43] "PM320" "PM86" "PM49" "PM97" "PM99" "PM80" "PM321"
## [50] "PM12" "PM43" "PM39" "PM116" "PL130" "PM6" "PM15"
## [57] "PM64" "PL131" "PM21" "PM67" "PM51" "PM138" "PM129"
## [64] "PM36" "PM313" "PM307" "PM123" "PM139r2" "PM30" "PL145"
## [71] "PM130" "PL72" "PM4" "PM272" "PM47" "PL86" "PM54"
## [78] "PM66" "PM32" "PM52" "PM31" "PM38" "PL135" "PM119"
## [85] "PL142" "PM58" "PM98" "PM269" "PM53" "PM71" "PM44"
## [92] "PM55" "PM46" "PM115" "PL64" "PL64r1" "PL21" "PL21r"
## [99] "PM72" "PM72r" "PM139" "PM139r1"
rownames(des)<-des$ParticipantID
#rownames(des)
all(sampleNames(PROJECT) == rownames(des)) #FALSE
## [1] FALSE
##Reorder samples
des<- des[sampleNames(PROJECT),]
all(sampleNames(PROJECT) == rownames(des)) #TRUE
## [1] TRUE
Mset.fnorm <- MSet.fnorm[,match(rownames(des), MSet.fnorm$ParticipantID)]
all(Mset.fnorm$ParticipantID == rownames(des)) #TRUE
## [1] TRUE
filtDat <- exprs(PROJECT[substring(featureNames(PROJECT), 1, 2) != "rs", ])
keepNames <- paste(as.character(des$Sentrix_ID), as.character(des$Sentrix_Position), sep = "_")
fnorm.M <- getM(MSet.fnorm)
fnorm.M.filt <- fnorm.M[rownames(filtDat), keepNames]
colnames(fnorm.M.filt) <- rownames(des)
filtDat[1:5, 1:5]
## PL102 PL104 PL11 PL112 PL113
## cg00000029 -3.13138233 -2.81453252 -2.876211 -3.173261 -2.69639505
## cg00000108 3.21550972 4.86493283 4.192221 4.452645 4.31591572
## cg00000109 3.64353376 2.82114924 2.935870 3.619880 3.69273103
## cg00000165 0.03686645 -0.02437925 1.550197 1.150884 0.00256024
## cg00000236 2.05694278 2.58129618 2.127159 2.577474 3.13964697
fnorm.M.filt[1:5, 1:5]
## PL102 PL104 PL11 PL112 PL113
## cg00000029 -3.6732976 -3.876580 -4.266179 -3.912338 -4.2786012
## cg00000108 3.4019623 4.053420 3.723608 4.504231 4.1118243
## cg00000109 2.7397347 2.252949 2.212637 2.972016 2.9067982
## cg00000165 0.1736604 -0.346731 1.433691 1.346100 -0.7182395
## cg00000236 2.1026708 2.147054 1.890507 2.781738 2.7903153
PROJECT.fun <- PROJECT[substring(featureNames(PROJECT), 1, 2) != "rs", ]
all(colnames(exprs(PROJECT.fun)) %in% colnames(fnorm.M.filt))
## [1] TRUE
##double equal sign is more stringent, if samples are not in the same order than it will be false. I changed in == in %in% to just check if all the samples in each list are the same.
all(rownames(exprs(PROJECT.fun)) == rownames(fnorm.M.filt))##TRUE
## [1] TRUE
exprs(PROJECT.fun) <- fnorm.M.filt
sum(is.na(exprs(PROJECT.fun)))##0
## [1] 0
##We didn't have a project.2 for this data- but it is only the beta value which I can later create a beta matrix with just out project.
save(PROJECT.fun, file = "Z:/ROBLAB1 coredata-databases/1 Samantha DATA Folder/PROJECTS/PE_IUGR_Array/Robinson Cohort/PROJECT.fnorm_Jan2016.RData")
dim(PROJECT.fun)
## Features Samples
## 441093 102
##yay probes are already filterd
Looking at variance due to batch in the fnorm data Heat Scree plot code written by Rachel Edgar and Sumaiya Islam
Citation:De Souza, Rebecca AG, et al. “DNA methylation profiling in human Huntington’s disease brain.” Human molecular genetics (2016): ddw076.
##Heatmap and scree plot function
### Function of association meta variable with PC (ANOVA)
heat_scree_plot<-function(Loadings, Importance, Num, Order){
adjust<-1-Importance[1]
pca_adjusted<-Importance[2:length(Importance)]/adjust
pca_df<-data.frame(adjusted_variance=pca_adjusted, PC=seq(1:length(pca_adjusted)))
scree<-ggplot(pca_df[which(pca_df$PC<Num),],aes(PC,adjusted_variance))+geom_bar(stat = "identity",color="black",fill="grey")+theme_bw()+
theme(axis.text = element_text(size =12),
axis.title = element_text(size =15),
plot.margin=unit(c(1,1.5,0.2,2.25),"cm"))+ylab("Variance")+
scale_x_continuous(breaks = seq(1,Num,1))
#### Heat
## correlate meta with PCS
## Run anova of each PC on each meta data variable
aov_PC_meta<-lapply(1:ncol(meta_categorical), function(covar) sapply(1:ncol(Loadings), function(PC) summary(aov(Loadings[,PC]~meta_categorical[,covar]))[[1]]$"Pr(>F)"[1]))
cor_PC_meta<-lapply(1:ncol(meta_continuous), function(covar) sapply(1:ncol(Loadings), function(PC) (cor.test(Loadings[,PC],as.numeric(meta_continuous[,covar]),alternative = "two.sided", method="spearman", na.action=na.omit)$p.value)))
names(aov_PC_meta)<-colnames(meta_categorical)
names(cor_PC_meta)<-colnames(meta_continuous)
aov_PC_meta<-do.call(rbind, aov_PC_meta)
cor_PC_meta<-do.call(rbind, cor_PC_meta)
aov_PC_meta<-rbind(aov_PC_meta, cor_PC_meta)
aov_PC_meta<-as.data.frame(aov_PC_meta)
#adjust
aov_PC_meta_adjust<-aov_PC_meta[,2:ncol(aov_PC_meta)]
#reshape
avo<-aov_PC_meta_adjust[,1:(Num-1)]
avo_heat_num<-apply(avo,2, as.numeric)
avo_heat<-as.data.frame(avo_heat_num)
colnames(avo_heat)<-sapply(1:(Num-1), function(x) paste("PC",x, sep=""))
avo_heat$meta<-rownames(avo)
avo_heat_melt<-melt(avo_heat, id=c("meta"))
# cluster meta data
ord <- Order
meta_var_order<-unique(avo_heat_melt$meta)[rev(ord)]
avo_heat_melt$meta <- factor(avo_heat_melt$meta, levels = meta_var_order)
# color if sig
# avo_heat_melt$Pvalue<-sapply(1:nrow(avo_heat_melt), function(x) if(avo_heat_melt$value[x]>=0.9){">=0.9"}else{
# if(avo_heat_melt$value[x]>=0.5){">=0.5"}else{
# if(avo_heat_melt$value[x]>=0.1){">=0.1"}else{"<0.1"}}})
avo_heat_melt$Pvalue<-sapply(1:nrow(avo_heat_melt), function(x) if(avo_heat_melt$value[x]<=0.001){"<=0.001"}else{
if(avo_heat_melt$value[x]<=0.01){"<=0.01"}else{
if(avo_heat_melt$value[x]<=0.05){"<=0.05"}else{">0.05"}}})
heat<-ggplot(avo_heat_melt, aes(variable,meta, fill = Pvalue)) +
geom_tile(color = "black",size=0.5) +
theme_gray(8)+scale_fill_manual(values=c("#084594","#4292c6","#9ecae1","#deebf7"))+
theme(axis.text = element_text(size =10, color="black"),
axis.text.x = element_text(),
axis.title = element_text(size =15),
legend.text = element_text(size =14),
legend.title = element_text(size =12),
legend.position = c(1, 0), legend.justification = c(1,0),
plot.margin=unit(c(0,2.25,1,1),"cm"))+
xlab("Principal Component")+ylab(NULL)
grid.arrange(scree, heat, ncol=1, heights = c(2, 4))
}
identical(rownames(des), rownames(pData(PROJECT.fun)))##Should be true
## [1] TRUE
Dat<-PROJECT.fun
## re-structure meta data: change categorical variables to factors for ANOVA and continuous variables to numeric for Spearman's correlation
#str(des)
des$group<- as.factor(des$group)
des$Sex<- as.factor(des$Sex)
des$IUGR <- as.factor(des$IUGR)
des$MA<- as.numeric(des$MA)
des$GA <- as.numeric(des$GA)
des$BW<- as.numeric(des$BW)
des$BW_SD<- as.numeric(des$BW_SD)
des$Plate<-as.factor(des$Plate)
des$Sentrix_ID <-as.factor(des$Sentrix_ID)
des$Sentrix_Position<-as.factor(des$Sentrix_Position)
for (i in 1:nrow(des)){
des$Row[i]<-paste(substr(des[i,"Sentrix_Position"], start=1, stop=3))
}
des$Row<- as.factor(des$Row)
str(des)
## 'data.frame': 102 obs. of 18 variables:
## $ ParticipantID : Factor w/ 102 levels "PL102","PL104",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ group : Factor w/ 6 levels "EOPE","IUGR",..: 4 4 4 4 4 1 1 3 3 2 ...
## $ IUGR : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 2 2 2 2 2 ...
## $ Ethnicity : Factor w/ 10 levels "Asian","Caucasian",..: NA 3 NA NA NA NA NA NA NA NA ...
## $ MA : num 33.5 35.6 35 25.4 38.7 28.3 31.8 37 28.2 36.6 ...
## $ Sex : Factor w/ 2 levels "FEMALE","MALE": 2 1 2 2 2 2 1 2 2 2 ...
## $ GA : num 33.7 32.6 33.7 28 25 32.6 33.6 36.7 37 36.6 ...
## $ BW : num 2865 1685 2495 1245 978 ...
## $ BW_SD : num 3.23 -1.14 1.68 0.86 0.66 -8.19 -3.2 -1.9 -2.78 -2.31 ...
## $ PW : int 485 386 470 218 286 150 180 345 454 264 ...
## $ F_PL : num 5.91 4.37 5.31 5.71 3.42 5.47 7.39 5.74 4.06 6.93 ...
## $ PL_length : num NA NA NA NA NA NA NA NA NA NA ...
## $ PL_breadth : num NA NA NA NA NA NA NA NA NA NA ...
## $ Lgth_Bdth : num NA NA NA NA NA NA NA NA NA NA ...
## $ Plate : Factor w/ 5 levels "WG0003252-MSA2",..: 2 2 1 3 1 2 3 4 4 4 ...
## $ Sentrix_ID : Factor w/ 29 levels "6042308143","6042308147",..: 8 4 2 17 3 12 5 20 22 24 ...
## $ Sentrix_Position: Factor w/ 12 levels "R01C01","R01C02",..: 5 3 1 12 9 10 12 10 12 12 ...
## $ Row : Factor w/ 6 levels "R01","R02","R03",..: 3 2 1 6 5 5 6 5 6 6 ...
PCA Heat Scree Plot
## PCA
PCA_full<-princomp(scale(betas(Dat), center = TRUE, scale = FALSE), cor=FALSE) # scaling is not necessary for normalized dataset
Loadings<-as.data.frame(unclass(PCA_full$loadings))
vars <- PCA_full$sdev^2
Importance<-vars/sum(vars)
adjust<-1-Importance[1]
pca_adjusted<-Importance[2:length(Importance)]/adjust
(pca_df<-data.frame(adjusted_variance=pca_adjusted, PC=seq(1:length(pca_adjusted))))
## adjusted_variance PC
## Comp.2 0.113449116 1
## Comp.3 0.076055370 2
## Comp.4 0.057013272 3
## Comp.5 0.046101144 4
## Comp.6 0.028700743 5
## Comp.7 0.025766780 6
## Comp.8 0.021841028 7
## Comp.9 0.018628370 8
## Comp.10 0.015678426 9
## Comp.11 0.015083320 10
## Comp.12 0.015000737 11
## Comp.13 0.013564206 12
## Comp.14 0.013230139 13
## Comp.15 0.012515171 14
## Comp.16 0.011503935 15
## Comp.17 0.010708329 16
## Comp.18 0.010687605 17
## Comp.19 0.010303455 18
## Comp.20 0.009965694 19
## Comp.21 0.009736338 20
## Comp.22 0.009448962 21
## Comp.23 0.009050639 22
## Comp.24 0.008844260 23
## Comp.25 0.008772987 24
## Comp.26 0.008623949 25
## Comp.27 0.008489891 26
## Comp.28 0.008268791 27
## Comp.29 0.008146922 28
## Comp.30 0.008045255 29
## Comp.31 0.007929608 30
## Comp.32 0.007643862 31
## Comp.33 0.007576485 32
## Comp.34 0.007529566 33
## Comp.35 0.007412719 34
## Comp.36 0.007370633 35
## Comp.37 0.007182711 36
## Comp.38 0.007144231 37
## Comp.39 0.007042566 38
## Comp.40 0.007006999 39
## Comp.41 0.006882285 40
## Comp.42 0.006782398 41
## Comp.43 0.006725073 42
## Comp.44 0.006652648 43
## Comp.45 0.006621915 44
## Comp.46 0.006547018 45
## Comp.47 0.006440201 46
## Comp.48 0.006408367 47
## Comp.49 0.006345469 48
## Comp.50 0.006316377 49
## Comp.51 0.006304125 50
## Comp.52 0.006130970 51
## Comp.53 0.006109661 52
## Comp.54 0.006031840 53
## Comp.55 0.005968790 54
## Comp.56 0.005908848 55
## Comp.57 0.005870033 56
## Comp.58 0.005829080 57
## Comp.59 0.005767752 58
## Comp.60 0.005728267 59
## Comp.61 0.005672915 60
## Comp.62 0.005665185 61
## Comp.63 0.005614755 62
## Comp.64 0.005549477 63
## Comp.65 0.005473583 64
## Comp.66 0.005457264 65
## Comp.67 0.005401379 66
## Comp.68 0.005394428 67
## Comp.69 0.005324909 68
## Comp.70 0.005267385 69
## Comp.71 0.005249010 70
## Comp.72 0.005212331 71
## Comp.73 0.005153902 72
## Comp.74 0.005080530 73
## Comp.75 0.005060338 74
## Comp.76 0.004971750 75
## Comp.77 0.004958544 76
## Comp.78 0.004910670 77
## Comp.79 0.004892844 78
## Comp.80 0.004886659 79
## Comp.81 0.004837621 80
## Comp.82 0.004820007 81
## Comp.83 0.004775431 82
## Comp.84 0.004765219 83
## Comp.85 0.004718784 84
## Comp.86 0.004679832 85
## Comp.87 0.004632344 86
## Comp.88 0.004600985 87
## Comp.89 0.004517030 88
## Comp.90 0.004507145 89
## Comp.91 0.004459647 90
## Comp.92 0.004422724 91
## Comp.93 0.004222366 92
## Comp.94 0.004199196 93
## Comp.95 0.004069598 94
## Comp.96 0.004016361 95
## Comp.97 0.002268523 96
## Comp.98 0.001976785 97
## Comp.99 0.001774402 98
## Comp.100 0.001616968 99
## Comp.101 0.001434610 100
## Comp.102 0.001051230 101
#save(pca_df, file="Adj_PC_variance_PostFNorm_Jan2016.txt")
meta_categorical<-des[,c("group", "Sex","Plate", "Sentrix_ID", "Row")] # input column numbers in meta that contain categorical variables
meta_continuous<-des[,c("BW", "MA", "GA")] # input column numbers in meta that contain continuous variables
meta_continuous<-data.frame(meta_continuous)
colnames(meta_categorical)<-c("Pathology","Sex", "Chip", "Row","Plate")
colnames(meta_continuous)<-c("Birth_weight", "Maternal_Age","Gestational_Age")
# Specifiy the number of PCs you want shown
Num<-20 # should be equal to the number of samples in your dataset; for large datasets, you can opt to just see the top PCs
# Designate what order you want the variables to appear (continuous variables rbinded to categorical variables in function)
Order<-c(1,2,3,4,5,6,7,8)
#Apply function on PCA results, pulls in the meta data and beta values from above
heat_scree_plot(Loadings, Importance, Num, Order)
P-value distribution post Functional normalization
#head(Des)
Des1 = model.matrix(~group+GA + Sex, data = des)
head(Des1)
## (Intercept) groupIUGR groupLOPE groupPreT groupREPLICATE groupTerm
## PL102 1 0 0 1 0 0
## PL104 1 0 0 1 0 0
## PL11 1 0 0 1 0 0
## PL112 1 0 0 1 0 0
## PL113 1 0 0 1 0 0
## PL130 1 0 0 0 0 0
## GA SexMALE
## PL102 33.7 1
## PL104 32.6 0
## PL11 33.7 1
## PL112 28.0 1
## PL113 25.0 1
## PL130 32.6 1
fit1 = lmFit(PROJECT.fun, Des1)
fit1 = eBayes(fit1)
##Using Sex for the p-value distribution as to not bias myself for the number of hits in my variable of interest
tt_sex = topTable(fit1, coef = "SexMALE", n = Inf)
qplot(tt_sex$P.Value, geom = "density", main = "Fetal Sex (Male)", xlab = "p value",ylim=c(0,3))
##Clearly still an issue, fetal sex will be but into the linear regression model